{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Deploy the XGBoost Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q boto3\n",
    "!pip install -q xgboost==0.90"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "import pandas as pd\n",
    "\n",
    "sess   = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name\n",
    "\n",
    "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r spark_processing_job_s3_output_prefix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print('Previous Spark Processing Job Name: {}'.format(spark_processing_job_s3_output_prefix))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Specify the S3 Location of the Features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle as pkl\n",
    "\n",
    "model_dir = './models/built-in'\n",
    "model_path = os.path.join(model_dir, 'xgboost-model')\n",
    "\n",
    "xgb_estimator_restored = pkl.load(open(model_path, 'rb'))\n",
    "\n",
    "type(xgb_estimator_restored)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#  Calculate Test Metrics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download Test Dataset and Load into Memory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "prefix_train = '{}/output/tfidf-labeled-split-balanced-noheader-train'.format(spark_processing_job_s3_output_prefix)\n",
    "prefix_validation = '{}/output/tfidf-labeled-split-balanced-noheader-validation'.format(spark_processing_job_s3_output_prefix)\n",
    "prefix_test = '{}/output/tfidf-labeled-split-balanced-noheader-test'.format(spark_processing_job_s3_output_prefix)\n",
    "\n",
    "balanced_tfidf_without_header_train_path = './{}'.format(prefix_train)\n",
    "balanced_tfidf_without_header_validation_path = './{}'.format(prefix_validation)\n",
    "balanced_tfidf_without_header_test_path = './{}'.format(prefix_test)\n",
    "\n",
    "balanced_tfidf_without_header_train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)\n",
    "balanced_tfidf_without_header_validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)\n",
    "balanced_tfidf_without_header_test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)\n",
    "\n",
    "s3_input_train_data = sagemaker.s3_input(s3_data=balanced_tfidf_without_header_train_s3_uri, content_type='text/csv')\n",
    "s3_input_validation_data = sagemaker.s3_input(s3_data=balanced_tfidf_without_header_validation_s3_uri, content_type='text/csv')\n",
    "s3_input_test_data = sagemaker.s3_input(s3_data=balanced_tfidf_without_header_test_s3_uri, content_type='text/csv')\n",
    "\n",
    "print(s3_input_train_data.config)\n",
    "print(s3_input_validation_data.config)\n",
    "print(s3_input_test_data.config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.makedirs(prefix_test, exist_ok=True)\n",
    "balanced_tfidf_without_header_test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)\n",
    "balanced_tfidf_without_header_test_path = './{}'.format(prefix_test)\n",
    "\n",
    "print(balanced_tfidf_without_header_test_s3_uri)\n",
    "print(balanced_tfidf_without_header_test_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 ls --recursive $balanced_tfidf_without_header_test_s3_uri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!aws s3 cp --recursive $balanced_tfidf_without_header_test_s3_uri $balanced_tfidf_without_header_test_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import pandas as pd\n",
    "\n",
    "def load_dataset(path, sep, header):\n",
    "    data = pd.concat([pd.read_csv(f, sep=sep, header=header) for f in glob.glob('{}/*.csv'.format(path))], ignore_index = True)\n",
    "\n",
    "    labels = data.iloc[:,0]\n",
    "    features = data.drop(data.columns[0], axis=1)\n",
    "    \n",
    "    if header==None:\n",
    "        # Adjust the column names after dropped the 0th column above\n",
    "        # New column names are 0 (inclusive) to len(features.columns) (exclusive)\n",
    "        new_column_names = list(range(0, len(features.columns)))\n",
    "        features.columns = new_column_names\n",
    "\n",
    "    return features, labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test, y_test = load_dataset(path=balanced_tfidf_without_header_test_path, sep=',', header=None)\n",
    "X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "X_test.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Deploy Endpoint"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### From an external application, you can use the following code to make a prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "import pandas as pd\n",
    "\n",
    "sess   = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "# https://towardsdatascience.com/xgboost-in-amazon-sagemaker-28e5e354dbcd\n",
    "from sagemaker.predictor import csv_serializer\n",
    "\n",
    "xgb_endpoint_name = 'xgboost-built-in-{}'.format(time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime()))\n",
    "xgb_endpoint_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO:  Set XGBoostPredictor before deploying"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO:  1) update this to do TF/IDF\n",
    "#        2) use this in other versions of the model\n",
    "# Derived from the following:\n",
    "#   https://aim357.readthedocs.io/en/latest/GluePySparkMLFeatureEngineering/GluePySparkMLFeatureEngineering.html#deepar-deep-dive\n",
    "\n",
    "class XGBoostPredictor(sagemaker.predictor.RealTimePredictor):\n",
    "\n",
    "    def __init__(self, *args, **kwargs):\n",
    "        super().__init__(*args, content_type=sagemaker.content_types.CONTENT_TYPE_JSON, **kwargs)\n",
    "\n",
    "    def predict(self, ts, cat=None, dynamic_feat=None,\n",
    "                num_samples=100, return_samples=False, quantiles=[\"0.1\", \"0.5\", \"0.9\"]):\n",
    "        \"\"\"Requests the prediction of for the time series listed in `ts`, each with the (optional)\n",
    "        corresponding category listed in `cat`.\n",
    "\n",
    "        ts -- `pandas.Series` object, the time series to predict\n",
    "        cat -- integer, the group associated to the time series (default: None)\n",
    "        num_samples -- integer, number of samples to compute at prediction time (default: 100)\n",
    "        return_samples -- boolean indicating whether to include samples in the response (default: False)\n",
    "        quantiles -- list of strings specifying the quantiles to compute (default: [\"0.1\", \"0.5\", \"0.9\"])\n",
    "\n",
    "        Return value: list of `pandas.DataFrame` objects, each containing the predictions\n",
    "        \"\"\"\n",
    "        prediction_time = ts.index[-1] + 1\n",
    "        quantiles = [str(q) for q in quantiles]\n",
    "        req = self.__encode_request(ts, cat, dynamic_feat, num_samples, return_samples, quantiles)\n",
    "        res = super(DeepARPredictor, self).predict(req)\n",
    "        return self.__decode_response(res, ts.index.freq, prediction_time, return_samples)\n",
    "\n",
    "    def __encode_request(self, ts, cat, dynamic_feat, num_samples, return_samples, quantiles):\n",
    "        instance = series_to_dict(ts, cat if cat is not None else None, dynamic_feat if dynamic_feat else None)\n",
    "\n",
    "        configuration = {\n",
    "            \"num_samples\": num_samples,\n",
    "            \"output_types\": [\"quantiles\", \"samples\"] if return_samples else [\"quantiles\"],\n",
    "            \"quantiles\": quantiles\n",
    "        }\n",
    "\n",
    "        http_request_data = {\n",
    "            \"instances\": [instance],\n",
    "            \"configuration\": configuration\n",
    "        }\n",
    "\n",
    "        return json.dumps(http_request_data).encode('utf-8')\n",
    "\n",
    "    def __decode_response(self, response, freq, prediction_time, return_samples):\n",
    "        # we only sent one time series so we only receive one in return\n",
    "        # however, if possible one will pass multiple time series as predictions will then be faster\n",
    "        predictions = json.loads(response.decode('utf-8'))['predictions'][0]\n",
    "        prediction_length = len(next(iter(predictions['quantiles'].values())))\n",
    "        prediction_index = pd.DatetimeIndex(start=prediction_time, freq=freq, periods=prediction_length)\n",
    "        if return_samples:\n",
    "            dict_of_samples = {'sample_' + str(i): s for i, s in enumerate(predictions['samples'])}\n",
    "        else:\n",
    "            dict_of_samples = {}\n",
    "        return pd.DataFrame(data={**predictions['quantiles'], **dict_of_samples}, index=prediction_index)\n",
    "\n",
    "    def set_frequency(self, freq):\n",
    "        self.freq = freq\n",
    "\n",
    "def encode_target(ts):\n",
    "    return [x if np.isfinite(x) else \"NaN\" for x in ts]\n",
    "\n",
    "def series_to_dict(ts, cat=None, dynamic_feat=None):\n",
    "    \"\"\"Given a pandas.Series object, returns a dictionary encoding the time series.\n",
    "\n",
    "    ts -- a pands.Series object with the target time series\n",
    "    cat -- an integer indicating the time series category\n",
    "\n",
    "    Return value: a dictionary\n",
    "    \"\"\"\n",
    "    obj = {\"start\": str(ts.index[0]), \"target\": encode_target(ts)}\n",
    "    if cat is not None:\n",
    "        obj[\"cat\"] = cat\n",
    "    if dynamic_feat is not None:\n",
    "        obj[\"dynamic_feat\"] = dynamic_feat\n",
    "    return obj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_endpoint_name = prefix + time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n",
    "\n",
    "xgb_predictor = xgb_estimator.deploy(\n",
    "                     initial_instance_count=1, \n",
    "                     instance_type='ml.m4.xlarge',\n",
    "#                     predictor_cls=XGBoostPredictor,\n",
    "                     endpoint_name=xgb_endpoint_name)\n",
    "\n",
    "#xgb_predictor.content_type = 'text/csv'\n",
    "#xgb_predictor.serializer = csv_serializer\n",
    "#xgb_predictor.deserializer = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix\n",
    "\n",
    "sm_runtime = boto3.client('sagemaker-runtime')\n",
    "\n",
    "payload_500_samples = X_test[:500].to_csv(index=False, header=False).rstrip()\n",
    "\n",
    "response_500_samples = sm_runtime.invoke_endpoint(\n",
    "    EndpointName=xgb_endpoint_name,\n",
    "    Body=payload.encode('utf-8'),\n",
    "    ContentType='text/csv')['Body'].read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions_500_samples = np.fromstring(response_500_samples, sep=',')\n",
    "predictions_500_samples_0_or_1 = np.where(predictions_500_samples > 0.5, 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Test Accuracy: ', accuracy_score(y_test[:500], predictions_500_samples_0_or_1))\n",
    "print('Test Precision: ', precision_score(y_test[:500], predictions_500_samples_0_or_1, average=None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sn\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df_cm_test = confusion_matrix(y_test[:500], predictions_500_samples_0_or_1)\n",
    "df_cm_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):\n",
    "    print(cm)\n",
    "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
    "    plt.title(title)\n",
    "    plt.colorbar()\n",
    "    tick_marks = np.arange(len(classes))\n",
    "    plt.xticks(tick_marks, classes, rotation=45)\n",
    "    plt.yticks(tick_marks, classes)\n",
    "\n",
    "    fmt = 'd'\n",
    "    thresh = cm.max() / 2.\n",
    "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
    "        plt.text(j, i, format(cm[i, j], fmt),\n",
    "        horizontalalignment=\"center\",\n",
    "        color=\"black\" if cm[i, j] > thresh else \"black\")\n",
    "\n",
    "        plt.tight_layout()\n",
    "        plt.ylabel('True label')\n",
    "        plt.xlabel('Predicted label')\n",
    "\n",
    "# Plot non-normalized confusion matrix\n",
    "plt.figure()\n",
    "fig, ax = plt.subplots(figsize=(6,4))\n",
    "plot_conf_mat(df_cm_test, classes=['Not Positive Sentiment', 'Positive Sentiment'], \n",
    "                          title='Confusion matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "auc = round(metrics.roc_auc_score(y_test, preds_test), 4)\n",
    "print('AUC is ' + repr(auc))\n",
    "\n",
    "fpr, tpr, _ = metrics.roc_curve(y_test, preds_test)\n",
    "\n",
    "plt.title('ROC Curve')\n",
    "plt.plot(fpr, tpr, 'b',\n",
    "label='AUC = %0.2f'% auc)\n",
    "plt.legend(loc='lower right')\n",
    "plt.plot([0,1],[0,1],'r--')\n",
    "plt.xlim([-0.1,1.1])\n",
    "plt.ylim([-0.1,1.1])\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions, raw_outputs = model.predict([\"\"\"Very funny. A typical mid 50's comedy.\"\"\"])\n",
    "print('Predictions: {}'.format(predictions))\n",
    "print('Raw outputs: {}'.format(raw_outputs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions, raw_outputs = bert_model.predict([\"\"\"That movie was absolutely awful.\"\"\"])\n",
    "print('Predictions: {}'.format(predictions))\n",
    "print('Raw outputs: {}'.format(raw_outputs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
